import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
data_df = pd.read_csv("./Shopping_Daten.csv")
data_df.head()
| Unnamed: 0 | CustomerID | Gender | Location | Tenure_Months | Transaction_ID | Transaction_Date | Product_SKU | Product_Description | Product_Category | ... | Avg_Price | Delivery_Charges | Coupon_Status | GST | Date | Offline_Spend | Online_Spend | Month | Coupon_Code | Discount_pct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 17850.0 | M | Chicago | 12.0 | 16679.0 | 2019-01-01 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | ... | 153.71 | 6.5 | Used | 0.1 | 1/1/2019 | 4500.0 | 2424.5 | 1 | ELEC10 | 10.0 |
| 1 | 1 | 17850.0 | M | Chicago | 12.0 | 16680.0 | 2019-01-01 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | ... | 153.71 | 6.5 | Used | 0.1 | 1/1/2019 | 4500.0 | 2424.5 | 1 | ELEC10 | 10.0 |
| 2 | 2 | 17850.0 | M | Chicago | 12.0 | 16696.0 | 2019-01-01 | GGOENEBQ078999 | Nest Cam Outdoor Security Camera - USA | Nest-USA | ... | 122.77 | 6.5 | Not Used | 0.1 | 1/1/2019 | 4500.0 | 2424.5 | 1 | ELEC10 | 10.0 |
| 3 | 3 | 17850.0 | M | Chicago | 12.0 | 16699.0 | 2019-01-01 | GGOENEBQ079099 | Nest Protect Smoke + CO White Battery Alarm-USA | Nest-USA | ... | 81.50 | 6.5 | Clicked | 0.1 | 1/1/2019 | 4500.0 | 2424.5 | 1 | ELEC10 | 10.0 |
| 4 | 4 | 17850.0 | M | Chicago | 12.0 | 16700.0 | 2019-01-01 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | ... | 153.71 | 6.5 | Clicked | 0.1 | 1/1/2019 | 4500.0 | 2424.5 | 1 | ELEC10 | 10.0 |
5 rows × 21 columns
profile = pandas_profiling.ProfileReport(data_df, title='Pandas Profiling Report', explorative=True)
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
data_df.shape
(52955, 21)
data_df.isna().sum()
Unnamed: 0 0 CustomerID 31 Gender 31 Location 31 Tenure_Months 31 Transaction_ID 31 Transaction_Date 31 Product_SKU 31 Product_Description 31 Product_Category 0 Quantity 31 Avg_Price 31 Delivery_Charges 31 Coupon_Status 31 GST 31 Date 31 Offline_Spend 31 Online_Spend 31 Month 0 Coupon_Code 400 Discount_pct 400 dtype: int64
data_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 52955 entries, 0 to 52954 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 52955 non-null int64 1 CustomerID 52924 non-null float64 2 Gender 52924 non-null object 3 Location 52924 non-null object 4 Tenure_Months 52924 non-null float64 5 Transaction_ID 52924 non-null float64 6 Transaction_Date 52924 non-null object 7 Product_SKU 52924 non-null object 8 Product_Description 52924 non-null object 9 Product_Category 52955 non-null object 10 Quantity 52924 non-null float64 11 Avg_Price 52924 non-null float64 12 Delivery_Charges 52924 non-null float64 13 Coupon_Status 52924 non-null object 14 GST 52924 non-null float64 15 Date 52924 non-null object 16 Offline_Spend 52924 non-null float64 17 Online_Spend 52924 non-null float64 18 Month 52955 non-null int64 19 Coupon_Code 52555 non-null object 20 Discount_pct 52555 non-null float64 dtypes: float64(10), int64(2), object(9) memory usage: 8.5+ MB
data_df.duplicated().any()
False
data_df.describe()
| Unnamed: 0 | CustomerID | Tenure_Months | Transaction_ID | Quantity | Avg_Price | Delivery_Charges | GST | Offline_Spend | Online_Spend | Month | Discount_pct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 52955.000000 | 52924.00000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52955.000000 | 52555.000000 |
| mean | 26477.000000 | 15346.70981 | 26.127995 | 32409.825675 | 4.497638 | 52.237646 | 10.517630 | 0.137462 | 2830.914141 | 1893.109119 | 6.652800 | 19.953382 |
| std | 15286.936089 | 1766.55602 | 13.478285 | 8648.668977 | 20.104711 | 64.006882 | 19.475613 | 0.045825 | 936.154247 | 807.014092 | 3.333664 | 8.127108 |
| min | 0.000000 | 12346.00000 | 2.000000 | 16679.000000 | 1.000000 | 0.390000 | 0.000000 | 0.050000 | 500.000000 | 320.250000 | 1.000000 | 10.000000 |
| 25% | 13238.500000 | 13869.00000 | 15.000000 | 25384.000000 | 1.000000 | 5.700000 | 6.000000 | 0.100000 | 2500.000000 | 1252.630000 | 4.000000 | 10.000000 |
| 50% | 26477.000000 | 15311.00000 | 27.000000 | 32625.500000 | 1.000000 | 16.990000 | 6.000000 | 0.180000 | 3000.000000 | 1837.870000 | 7.000000 | 20.000000 |
| 75% | 39715.500000 | 16996.25000 | 37.000000 | 39126.250000 | 2.000000 | 102.130000 | 6.500000 | 0.180000 | 3500.000000 | 2425.350000 | 9.000000 | 30.000000 |
| max | 52954.000000 | 18283.00000 | 50.000000 | 48497.000000 | 900.000000 | 355.740000 | 521.360000 | 0.180000 | 5000.000000 | 4556.930000 | 12.000000 | 30.000000 |
data_df.describe(exclude = np.number)
| Gender | Location | Transaction_Date | Product_SKU | Product_Description | Product_Category | Coupon_Status | Date | Coupon_Code | |
|---|---|---|---|---|---|---|---|---|---|
| count | 52924 | 52924 | 52924 | 52924 | 52924 | 52955 | 52924 | 52924 | 52555 |
| unique | 2 | 5 | 365 | 1145 | 404 | 21 | 3 | 365 | 48 |
| top | F | Chicago | 2019-11-27 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Apparel | Clicked | 11/27/2019 | SALE20 |
| freq | 33007 | 18380 | 335 | 3511 | 3511 | 18126 | 26926 | 335 | 6373 |
Since the there are rows with NaN-values and some columns are not needed for further analysis, we remove those.
# first drop unnecessary columns
data_df.drop(columns = ["Unnamed: 0", "CustomerID", "Coupon_Code", "Product_SKU", "Product_Description", "Transaction_ID"], inplace = True)
data_df.isna().sum()
Gender 31 Location 31 Tenure_Months 31 Transaction_Date 31 Product_Category 0 Quantity 31 Avg_Price 31 Delivery_Charges 31 Coupon_Status 31 GST 31 Date 31 Offline_Spend 31 Online_Spend 31 Month 0 Discount_pct 400 dtype: int64
data_df.dropna(inplace = True)
data_df.isna().sum()
Gender 0 Location 0 Tenure_Months 0 Transaction_Date 0 Product_Category 0 Quantity 0 Avg_Price 0 Delivery_Charges 0 Coupon_Status 0 GST 0 Date 0 Offline_Spend 0 Online_Spend 0 Month 0 Discount_pct 0 dtype: int64
data_df.to_csv("./shopping_data_cleaned_full.csv")